Multimapping Analysis
Load packages
First we will load in the packages needed to execute this analysis
library(rmdformats)
library(tidyverse)
library(magrittr)
library(edgeR)
library(ggplot2)
library(viridis)
library(ggfortify)
library(PCAtools)
library(AnnotationHub)
library(ensembldb)
library(pheatmap)
library(cowplot)
library(ggbeeswarm)
library(grid)
library(gridExtra)
library(tibble)
library(EnsDb.Hsapiens.v86)
library(here)Some methods will be called “hisat2”, this just means that bootstrapping estimates have not been taken into account. Nothing super special going on.
I have also defined several functions here for quality of life purposes
Multi mapping analysis
Each method handles multiple mapping reads differently -Salmon and SA will calculate the probability of a multiple mapped read to each of its mapped transcripts -Kallisto does not report multiple mapping pseudoalignments -Hisat2 will search for multiple alignments and only report the best one
Seems like it really depends on what you wanna do which will dictate the method you use
There are options in both Kallisto and Hisat2 that can handle multiple mappings differently I think
Hisat2’s handling may also be the reason why there are way less ambiguous reads in its run
Load the data
# From 01_Annotations
txp_gene_ensdb_lengths <- read_csv(
here("data/annotations/txp_gene_ensdb_lengths.csv.gz")
)
gene_txp_anno <- read_csv(here("data/annotations/grch38_103_df.csv.gz"))
transcript_key <- dplyr::select(txp_gene_ensdb_lengths,
c("transcript_id" = tx_id,
"transcript_name" = tx_name))
# From 02_dtelist_prep
hisat2_dtelist <- read_rds(here("data/counts/hisat2_dtelist.rds"))
kallisto_dtelist <- read_rds(here("data/counts/kallisto_dtelist.rds"))
salmon_dtelist <- read_rds(here("data/counts/sa_dtelist.rds"))
star_dtelist <- read_rds(here("data/counts/star_dtelist.rds"))
# From 04_principal_component_analysis
hisat2_loadings <- read_csv(
here("data/pca/loadings/hisat2_loadings.csv")
) %>%
head(200)
kallisto_loadings <- read_csv(
here("data/pca/loadings/kallisto_loadings.csv")
) %>%
head(200)
salmon_loadings <- read_csv(
here("data/pca/loadings/salmon_loadings.csv")
) %>%
head(200)
star_loadings <- read_csv(
here("data/pca/loadings/star_loadings.csv")
) %>%
head(200)
# From figure2_make.Rmd
hisat2_unique <- read_csv(here("data/unique_transcripts/hisat2_unique.csv.gz"))
kallisto_unique <- read_csv(
here("data/unique_transcripts/kallisto_unique.csv.gz")
)
salmon_unique <- read_csv(here("data/unique_transcripts/salmon_unique.csv.gz"))
star_unique <- read_csv(here("data/unique_transcripts/star_unique.csv.gz"))
# From 05_dte_and_dtu
## DTE
hisat2_dte_exclusive <- read_csv(
here("data/differential_expression/hisat2_exclusive_tx.csv")
) %>%
head(200)
kallisto_dte_exclusive <- read_csv(
here("data/differential_expression/kallisto_exclusive_tx.csv")
) %>%
head(200)
salmon_dte_exclusive <- read_csv(
here("data/differential_expression/salmon_exclusive_tx.csv")
)
star_dte_exclusive <- read_csv(
here("data/differential_expression/star_exclusive_tx.csv")
) %>%
head(200)
all_methods_tx_dte <- read_csv(
here("data/differential_expression/all_methods_tx.csv")
)
## DTU
hisat2_dtu_exclusive <- read_csv(
here("data/differential_usage/hisat2_exclusive_dtu.csv")
) %>%
dplyr::rename("transcript_id" = Transcript) %>%
dplyr::arrange(txpFDR) %>%
head(200)
kallisto_dtu_exclusive <- read_csv(
here("data/differential_usage/kallisto_exclusive_dtu.csv")
) %>%
dplyr::rename("transcript_id" = Transcript) %>%
dplyr::arrange(txpFDR) %>%
head(200)
star_dtu_exclusive <- read_csv(
here("data/differential_usage/star_exclusive_dtu.csv")
) %>%
dplyr::rename("transcript_id" = Transcript) %>%
dplyr::arrange(txpFDR) %>%
head(200)
salmon_dtu_exclusive <- read_csv(
here("data/differential_usage/salmon_exclusive_dtu.csv")
) %>%
dplyr::rename("transcript_id" = Transcript) %>%
dplyr::arrange(txpFDR) %>%
head(200)
all_methods_tx_dtu <- read_csv(
here("data/differential_usage/all_methods_tx.csv")
) %>%
dplyr::rename("transcript_id" = Transcript) %>%
dplyr::arrange(txpFDR) %>%
head(200)Get Multi-Mapping Data
Quant Files
hisat2_quant_files <- character()
for(i in list.files(here("data/counts/HISAT2/hisat2_quant"))) {
quant <- paste0(here("data/counts/HISAT2/hisat2_quant/"), i, "/quant.sf")
hisat2_quant_files <- c(hisat2_quant_files, quant)
}
sa_quant_files <- character()
for(i in list.files(here("data/counts/SA/quant_files"))) {
quant <- paste0(here("data/counts/SA/quant_files/"), i)
sa_quant_files <- c(sa_quant_files, quant)
}
star_quant_files <- character()
for(i in list.files(here("data/counts/STAR/quant_files"))) {
quant <- paste0(here("data/counts/STAR/quant_files/"), i)
star_quant_files <- c(star_quant_files, quant)
}Ambiguous Read Files
hisat2_ambig_files <- character()
for(i in list.files(here("data/counts/HISAT2/hisat2_quant"))) {
ambig <- paste0(here("data/counts/HISAT2/hisat2_quant/"), i,
"/aux_info/ambig_info.tsv")
hisat2_ambig_files <- c(hisat2_ambig_files, ambig)
}
sa_ambig_files <- character()
for(i in list.files(here("data/counts/SA/ambig_info"))) {
ambig <- paste0(here("data/counts/SA/ambig_info/"), i)
sa_ambig_files <- c(sa_ambig_files, ambig)
}
star_ambig_files <- character()
for(i in list.files(here("data/counts/STAR/ambig_info"))) {
ambig <- paste0(here("data/counts/STAR/ambig_info/"), i)
star_ambig_files <- c(star_ambig_files, ambig)
}Set multi-mapping info function
multi_mapping <- function(quant, ambig) {
x <- cbind(quant, ambig) %>%
mutate(sum = TPM + NumReads + UniqueCount + AmbigCount) %>%
dplyr::filter(sum > 0) %>%
dplyr::select(-sum) %>%
mutate(multimap = NumReads-UniqueCount) %>%
mutate(percent_est_from_multimap = multimap/NumReads,
percent_est_from_multimap = replace_na(percent_est_from_multimap, 0),
percent_ambigs_used = multimap/AmbigCount,
percent_ambigs_used = replace_na(percent_ambigs_used, 0),
status = case_when(
UniqueCount == NumReads & UniqueCount > 0 ~ "all_unique_reads",
UniqueCount == 0 & NumReads == 0 & AmbigCount > 0 ~ "all_ambig_no_counts",
multimap == NumReads ~ "all_ambig_reads",
AmbigCount != NumReads & UniqueCount != NumReads & percent_est_from_multimap > 0.5 ~ "more_multi",
AmbigCount != NumReads & UniqueCount != NumReads & percent_est_from_multimap < 0.5 ~ "more_unique",
percent_est_from_multimap == 0.5 ~ "equal"
),
status = factor(status, levels = c("all_unique_reads",
"more_unique",
"equal",
"more_multi",
"all_ambig_reads")))
return(x)
}Get Multi-mapping for each Aligner
HISAT2
hisat2_quants <- list()
for(i in 1:length(hisat2_quant_files)) {
hisat2_quants[[i]] <- read_delim(
hisat2_quant_files[[i]]
)
}
names(hisat2_quants) <- list.files(here("data/counts/HISAT2/hisat2_quant"))
hisat2_ambigs <- list()
for(i in 1:length(hisat2_ambig_files)) {
hisat2_ambigs[[i]] <- read_tsv(
hisat2_ambig_files[[i]]
)
}
names(hisat2_ambigs) <- list.files(here("data/counts/HISAT2/hisat2_quant"))
hisat2_multi_mapping <- list()
for(i in 1:length(hisat2_quants)) {
hisat2_multi_mapping[[i]] <- multi_mapping(quant = hisat2_quants[[i]],
ambig = hisat2_ambigs[[i]])
}
names(hisat2_multi_mapping) <- names(hisat2_quants)Selective Alignment
sa_quants <- list()
for(i in 1:length(sa_quant_files)) {
sa_quants[[i]] <- read_delim(
sa_quant_files[[i]]
)
}
names(sa_quants) <- basename(sa_quant_files) %>%
str_remove("quant_") %>%
str_remove(".sf")
sa_ambigs <- list()
for(i in 1:length(sa_ambig_files)) {
sa_ambigs[[i]] <- read_tsv(
sa_ambig_files[[i]]
)
}
names(sa_ambigs) <- basename(sa_ambig_files) %>%
str_remove("_ambig_info") %>%
str_remove(".tsv")
sa_multi_mapping <- list()
for(i in 1:length(sa_quants)) {
sa_multi_mapping[[i]] <- multi_mapping(quant = sa_quants[[i]],
ambig = sa_ambigs[[i]])
}
names(sa_multi_mapping) <- names(sa_quants)STAR
star_quants <- list()
for(i in 1:length(star_quant_files)) {
star_quants[[i]] <- read_delim(
star_quant_files[[i]]
)
}
names(star_quants) <- basename(star_quant_files) %>%
str_remove("quant_") %>%
str_remove(".sf")
star_ambigs <- list()
for(i in 1:length(star_ambig_files)) {
star_ambigs[[i]] <- read_tsv(
star_ambig_files[[i]]
)
}
names(star_ambigs) <- basename(star_ambig_files) %>%
str_remove("_ambig_info") %>%
str_remove(".tsv")
star_multi_mapping <- list()
for(i in 1:length(star_quants)) {
star_multi_mapping[[i]] <- multi_mapping(quant = star_quants[[i]],
ambig = star_ambigs[[i]])
}
names(star_multi_mapping) <- names(star_quants)Determine STAR ambiguous reads after Salmon
mm_rate <- numeric()
for(i in names(star_multi_mapping)) {
sum_mm <- sum(star_multi_mapping[[i]][["multimap"]])
sum_un <- sum(star_multi_mapping[[i]][["UniqueCount"]])
rate <- sum_mm/(sum_mm+sum_un)
mm_rate <- c(mm_rate, rate)
}
multi_nms <- c("SRR13401116", "SRR13401117", "SRR13401118", "SRR13401119",
"SRR13401120", "SRR13401121", "SRR13401122", "SRR13401123")
mm_rate_df <- data.frame(multi_nms, mm_rate)
mean(mm_rate_df$mm_rate) # ~77.8%## [1] 0.7775229
STAR numbers directly from log files
star_multimap <- c(8.14+0.08, 9.25+0.08, 9.98+0.1, 8.19+0.08,
9.6+0.08, 9.04+0.08, 8.06+0.08, 7.94+0.08)
multi_nms <- c("SRR13401116", "SRR13401117", "SRR13401118", "SRR13401119",
"SRR13401120", "SRR13401121", "SRR13401122", "SRR13401123")
star_multimap_rates <- data.frame(multi_nms, star_multimap) %>%
set_colnames(c("sample_id", "multimap_rate"))
mean(star_multimap_rates$multimap_rate)## [1] 8.8575
Create joined data frames
joined_df <- get_joined_samples(w = hisat2_dtelist,
x = kallisto_dtelist,
y = star_dtelist,
z = salmon_dtelist,
method_w = "Hisat2",
method_x = "Kallisto",
method_y = "STAR",
method_z = "Salmon",
sample = 1)
cor_df <- joined_df
cor_df$variance <- apply(cor_df[,-1], 1, var)lowcor_ensdb <- cor_df %>%
dplyr::arrange(desc(variance)) %>%
head(200) %>%
dplyr::select(transcript_id) %>%
dplyr::left_join(gene_txp_anno,
by = "transcript_id")
lowcor_ensdb <- lowcor_ensdb %>%
dplyr::select(-score,
-phase) %>%
select_if(~ !all(is.na(.))) %>%
dplyr::select(-ccds_id,
-transcript_support_level,
-tag) %>%
drop_na()Create Multi-Mapping Group Lists
highcor_ensdb <- cor_df %>%
dplyr::arrange(variance) %>%
head(200) %>%
dplyr::select(transcript_id) %>%
dplyr::left_join(gene_txp_anno,
by = "transcript_id")
highcor_ensdb <- highcor_ensdb %>%
dplyr::select(-score,
-phase) %>%
select_if(~ !all(is.na(.))) %>%
dplyr::select(-ccds_id,
-transcript_support_level,
-tag) %>%
drop_na()group_list <- list()
group_list[["highcor"]] <- highcor_ensdb
group_list[["lowcor"]] <- lowcor_ensdb
group_list[["hisat2"]] <- hisat2_loadings
group_list[["kallisto"]] <- kallisto_loadings
group_list[["star"]] <- star_loadings
group_list[["salmon"]] <- salmon_loadings
group_list[["hisat2_unique"]] <- hisat2_unique
group_list[["kallisto_unique"]] <- kallisto_unique
group_list[["star_unique"]] <- star_unique
group_list[["salmon_unique"]] <- salmon_unique
group_list[["hisat2_dte"]] <- hisat2_dte_exclusive
group_list[["salmon_dte"]] <- salmon_dte_exclusive
group_list[["kallisto_dte"]] <- kallisto_dte_exclusive
group_list[["star_dte"]] <- star_dte_exclusive
#group_list[["all_methods_dte"]] <- all_methods_tx_dte
group_list[["hisat2_dtu"]] <- hisat2_dtu_exclusive
group_list[["kallisto_dtu"]] <- kallisto_dtu_exclusive
group_list[["star_dtu"]] <- star_dtu_exclusive
group_list[["salmon_dtu"]] <- salmon_dtu_exclusive
#group_list[["all_methods_dtu"]] <- all_methods_tx_dtuMulti mapping plots
Prep for multi-map
group_name <- names(group_list)
method_name <- names(multimap_list)
samp_nms <- names(multimap_list$HISAT2)
multimap_group_list <- list()
for(i in group_name) {
for(j in method_name) {
for(k in samp_nms) {
multimap_group_list[[i]][[j]][[k]] <- multimap_list[[j]][[k]] %>%
dplyr::mutate(transcript_id = str_remove(Name, "\\..*")) %>%
dplyr::filter(transcript_id %in% group_list[[i]]$transcript_id)
}
}
}Percentage of Multi-Mapping per Sample
for(i in group_name) {
for(j in samp_nms) {
message("Starting group: ", i," sample: ", j, " plot")
multi_map_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
ggplot(aes(x = group, y = percent_est_from_multimap)) +
geom_quasirandom() +
ggtitle(paste0("Sample: ", j)) +
labs(x = "",
y = "Multi-Mapping Reads (%)") +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
plot.title = element_text(colour = "black", size = 15,
face = "bold"))
ggsave(plot = multi_map_plot,
filename = paste0(j, "_percent_mm.png"),
path = here(paste0("figures/multi_map/", i, "/percent_mm/")),
height = 150,
width = 175,
dpi = 400,
units = "mm", create.dir = TRUE)
}
}Number of Reads by Method
for(i in group_name) {
for(j in samp_nms) {
message("Starting group: ", i," sample: ", j, " plot")
num_reads_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
ggplot(aes(x = group, y = NumReads)) +
geom_quasirandom() +
ggtitle(paste0("Sample: ", i)) +
labs(x = "",
y = "Number of Reads") +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
plot.title = element_text(colour = "black", size = 15,
face = "bold"))
ggsave(plot = num_reads_plot,
filename = paste0(j, "_numreads.png"),
path = here(paste0("figures/multi_map/", i, "/number_of_reads/")),
height = 150,
width = 175,
dpi = 400,
units = "mm", create.dir = TRUE)
}
}Counts by Method
for(i in group_name) {
for(j in samp_nms) {
message("Starting group: ", i," sample: ", j, " plot")
ambig_counts_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::select(group, UniqueCount, AmbigCount) %>%
pivot_longer(cols = c("UniqueCount", "AmbigCount"),
names_to = "type",
values_to = "counts") %>%
dplyr::mutate(type = case_when(
type == "AmbigCount" ~ "Multi-Mapped",
type == "UniqueCount" ~ "Uniquely Mapped"),
type = factor(type, levels = c("Uniquely Mapped", "Multi-Mapped"))) %>%
dplyr::mutate(counts = counts + 1) %>%
ggplot(aes(x = type, y = log2(counts))) +
geom_quasirandom(colour = "darkorange") +
geom_boxplot(fill = "blue", colour = "black", width = 0.2,
outlier.shape = NA) +
#scale_colour_manual(values = c("darkorange", "darkblue")) +
ggtitle(paste0("Sample: ", j)) +
labs(y = "Number of Reads (log2)",
x = "",
colour = "") +
facet_wrap(~ group) +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 14))
ggsave(plot = ambig_counts_plot,
filename = paste0(j, "_mm_counts.png"),
path = here(paste0("figures/multi_map/", i, "/counts_mm/")),
height = 200,
width = 300,
dpi = 400,
units = "mm", create.dir = TRUE)
}
}Ambig Ratios by Method
for(i in group_name) {
for(j in samp_nms) {
message("Starting group: ", i," sample: ", j, " plot")
ambig_counts_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::select(group, UniqueCount, AmbigCount) %>%
dplyr::mutate(ratio = log2((UniqueCount+1)/(AmbigCount+1)),
category = case_when(
ratio > 0 ~ "More Unique Mappings",
.default = "More Multiple Mappings"
),
category = factor(category,
levels = c("More Unique Mappings",
"More Multiple Mappings")
)) %>%
pivot_longer(cols = "ratio",
names_to = "type",
values_to = "counts") %>%
ggplot(aes(x = group, y = counts, colour = category)) +
geom_quasirandom() +
geom_boxplot(fill = "lightblue", colour = "black", width = 0.2,
outlier.shape = NA) +
scale_y_continuous(limits = c(-20, 20)) +
scale_colour_manual(values = c("darkblue", "darkorange")) +
ggtitle(paste0("Sample: ", j)) +
labs(y = "Log2 Ratio (Unique mappings/Multiple mappings)",
x = "",
colour = "") +
#facet_wrap(~ group) +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 12),
axis.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 14))
ggsave(plot = ambig_counts_plot,
filename = paste0(j, "_mm_ratios.png"),
path = here(paste0("figures/multi_map/", i, "/ratios_mm/")),
height = 200,
width = 300,
dpi = 400,
units = "mm", create.dir = TRUE)
}
}
for(j in samp_nms) {
ratio_df <- data.frame()
for(i in group_name[3:6]) {
message("Starting group: ", i," sample: ", j, " plot")
ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::mutate(group = factor(group,
levels = c("HISAT2", "Salmon", "STAR"))) %>%
dplyr::select(group, UniqueCount, AmbigCount) %>%
dplyr::mutate(ratio = log2((UniqueCount+1)/(AmbigCount+1)),
category = case_when(
ratio > 0 ~ "More Unique\nMappings",
.default = "More Multiple\nMappings"
),
category = factor(category,
levels = c("More Unique\nMappings",
"More Multiple\nMappings")
)) %>%
pivot_longer(cols = "ratio",
names_to = "type",
values_to = "counts") %>%
dplyr::mutate(tx_group = case_when(
i == "highcor" ~ "Concordant Transcripts",
i == "lowcor" ~ "Discordant Transcripts",
i == "hisat2" ~ "Top Negative\nPC1 Loadings",
i == "kallisto" ~ "Top Negative\nPC2 Loadings",
i == "star" ~ "Top Positive\nPC2 Loadings",
i == "salmon" ~ "Top Negative\nPC5 Loadings"
))
ratio_df <- rbind(ratio_df, ratio_mm)
}
ratio_plot <- ratio_df %>%
ggplot(aes(x = group, y = counts, colour = category)) +
geom_quasirandom() +
geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
outlier.shape = NA) +
scale_y_continuous(limits = c(-20, 20)) +
scale_colour_manual(values = c("darkblue", "darkorange")) +
ggtitle(paste0("Sample: ", j)) +
labs(y = "Log2 Ratio (Unique mappings/Multiple mappings)",
x = "",
colour = "Category") +
guides(colour = guide_legend(byrow = TRUE)) +
facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
theme_bw() +
theme(plot.title = element_text(colour = "black", size = 18),
axis.title = element_text(colour = "black", size = 18),
axis.text = element_text(colour = "black", size = 16),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 18),
legend.text = element_text(colour = "black", size = 16),
legend.title = element_text(colour = "black", size = 18),
legend.spacing.y = unit(0.5, "cm"))
ggsave(plot = ratio_plot,
filename = paste0(j, "_mm_ratios.png"),
path = here(paste0("figures/multi_map/all_groups_ratio/")),
height = 280,
width = 420,
dpi = 400,
units = "mm", create.dir = TRUE)
}for(j in samp_nms) {
# First do concordant and discordant
cd_ratio_df <- data.frame()
for(i in group_name[1:2]) {
message("Starting group: ", i," sample: ", j, " plot")
ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::mutate(group = factor(group,
levels = c("HISAT2", "Salmon", "STAR"))) %>%
dplyr::select(group, UniqueCount, AmbigCount) %>%
dplyr::mutate(ratio = log2((UniqueCount+1)/(AmbigCount+1)),
category = case_when(
ratio > 0 ~ "More Unique\nMappings",
.default = "More Multiple\nMappings"
),
category = factor(category,
levels = c("More Unique\nMappings",
"More Multiple\nMappings")
)) %>%
pivot_longer(cols = "ratio",
names_to = "type",
values_to = "counts") %>%
dplyr::mutate(tx_group = case_when(
i == "highcor" ~ "Concordant Transcripts",
i == "lowcor" ~ "Discordant Transcripts",
i == "hisat2" ~ "Top Negative\nPC1 Loadings",
i == "kallisto" ~ "Top Negative\nPC2 Loadings",
i == "star" ~ "Top Positive\nPC2 Loadings",
i == "salmon" ~ "Top Negative\nPC5 Loadings"
))
cd_ratio_df <- rbind(cd_ratio_df, ratio_mm)
}
cd_ratio_plot <- cd_ratio_df %>%
ggplot(aes(x = group, y = counts, colour = category)) +
geom_quasirandom() +
geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
outlier.shape = NA) +
scale_y_continuous(limits = c(-20, 20)) +
scale_colour_manual(values = c("darkblue", "darkorange")) +
ggtitle(paste0("Sample: ", j)) +
labs(colour = "Category") +
guides(colour = guide_legend(byrow = TRUE)) +
facet_wrap(~ tx_group, nrow = 1, ncol = 2) +
theme_bw() +
theme(plot.title = element_text(colour = "black", size = 18),
axis.title = element_blank(),
axis.text = element_text(colour = "black", size = 16),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 18),
legend.position = "none",
legend.text = element_text(colour = "black", size = 16),
legend.title = element_text(colour = "black", size = 18),
legend.spacing.y = unit(0.5, "cm"))
# Then do PC loadings
pc_ratio_df <- data.frame()
for(i in group_name[3:6]) {
message("Starting group: ", i," sample: ", j, " plot")
ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::mutate(group = factor(group,
levels = c("HISAT2", "Salmon", "STAR"))) %>%
dplyr::select(group, UniqueCount, AmbigCount) %>%
dplyr::mutate(ratio = log2((UniqueCount+1)/(AmbigCount+1)),
category = case_when(
ratio > 0 ~ "More Unique\nMappings",
.default = "More Multiple\nMappings"
),
category = factor(category,
levels = c("More Unique\nMappings",
"More Multiple\nMappings")
)) %>%
pivot_longer(cols = "ratio",
names_to = "type",
values_to = "counts") %>%
dplyr::mutate(tx_group = case_when(
i == "highcor" ~ "Concordant Transcripts",
i == "lowcor" ~ "Discordant Transcripts",
i == "hisat2" ~ "Top Negative\nPC1 Loadings",
i == "kallisto" ~ "Top Negative\nPC2 Loadings",
i == "star" ~ "Top Positive\nPC2 Loadings",
i == "salmon" ~ "Top Negative\nPC5 Loadings"
))
pc_ratio_df <- rbind(pc_ratio_df, ratio_mm)
}
pc_ratio_plot <- pc_ratio_df %>%
ggplot(aes(x = group, y = counts, colour = category)) +
geom_quasirandom() +
geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
outlier.shape = NA) +
scale_y_continuous(limits = c(-20, 20)) +
scale_colour_manual(values = c("darkblue", "darkorange")) +
#ggtitle(paste0("Sample: ", j)) +
labs(colour = "Category") +
guides(colour = guide_legend(byrow = TRUE)) +
facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
theme_bw() +
theme(plot.title = element_text(colour = "black", size = 18),
axis.title = element_blank(),
axis.text = element_text(colour = "black", size = 16),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 18),
legend.text = element_text(colour = "black", size = 16),
legend.title = element_text(colour = "black", size = 18),
legend.spacing.y = unit(0.5, "cm"))
fig_leg <- cowplot::get_legend(pc_ratio_plot)
ylab <- textGrob("Log2 Ratio (Unique mappings/Multiple mappings)",
gp = gpar(fontface = "bold",
col = "black",
fontsize = 20),
rot = 90)
plot_joined <- cowplot::plot_grid(
cowplot::plot_grid(cd_ratio_plot,
pc_ratio_plot + theme(legend.position = "none"),
ncol = 1,
labels = c("A)", "B)")),
fig_leg,
nrow = 1,
rel_widths = c(1, 0.15))
fig_5 <- grid.arrange(arrangeGrob(plot_joined, left = ylab))
ggsave(plot = fig_5,
filename = paste0(j, "_ratios_ambig_fig5.png"),
path = here(paste0("figures/chapter_figures/figure5_ratios")),
height = 340,
width = 420,
dpi = 400,
units = "mm", create.dir = TRUE)
}Multi Ratios by Method
for(i in group_name) {
for(j in samp_nms) {
message("Starting group: ", i," sample: ", j, " plot")
ambig_counts_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::select(group, UniqueCount, multimap) %>%
dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
category = case_when(
ratio > 0 ~ "More Unique Mappings",
.default = "More Multiple Mappings"
),
category = factor(category,
levels = c("More Unique Mappings",
"More Multiple Mappings")
)) %>%
pivot_longer(cols = "ratio",
names_to = "type",
values_to = "counts") %>%
ggplot(aes(x = group, y = counts, colour = category)) +
geom_quasirandom() +
geom_boxplot(fill = "lightblue", colour = "black", width = 0.2,
outlier.shape = NA) +
scale_y_continuous(limits = c(-20, 20)) +
scale_colour_manual(values = c("darkblue", "darkorange")) +
ggtitle(paste0("Sample: ", j)) +
labs(y = "Log2 Ratio (Unique mappings/Multiple mappings)",
x = "",
colour = "") +
#facet_wrap(~ group) +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 12),
axis.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 14))
ggsave(plot = ambig_counts_plot,
filename = paste0(j, "_mm_ratios_multi.png"),
path = here(paste0("figures/multi_map/", i, "/ratios_mm/")),
height = 150,
width = 175,
dpi = 400,
units = "mm", create.dir = TRUE)
}
}
for(j in samp_nms) {
ratio_df <- data.frame()
for(i in group_name[3:6]) {
message("Starting group: ", i," sample: ", j, " plot")
ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::mutate(group = factor(group,
levels = c("HISAT2", "Salmon", "STAR"))) %>%
dplyr::select(group, UniqueCount, multimap) %>%
dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
category = case_when(
ratio > 0 ~ "More Unique\nMappings",
.default = "More Multiple\nMappings"
),
category = factor(category,
levels = c("More Unique\nMappings",
"More Multiple\nMappings")
)) %>%
pivot_longer(cols = "ratio",
names_to = "type",
values_to = "counts") %>%
dplyr::mutate(tx_group = case_when(
i == "highcor" ~ "Concordant Transcripts",
i == "lowcor" ~ "Discordant Transcripts",
i == "hisat2" ~ "Top Negative\nPC1 Loadings",
i == "kallisto" ~ "Top Negative\nPC2 Loadings",
i == "star" ~ "Top Positive\nPC2 Loadings",
i == "salmon" ~ "Top Negative\nPC5 Loadings"
),
tx_group = factor(tx_group,
levels = c("Concordant Transcripts",
"Discordant Transcripts",
"Top Negative\nPC1 Loadings",
"Top Positive\nPC2 Loadings",
"Top Negative\nPC2 Loadings",
"Top Negative\nPC5 Loadings"))) %>%
dplyr::filter(!counts == 0)
ratio_df <- rbind(ratio_df, ratio_mm)
}
ratio_plot <- ratio_df %>%
ggplot(aes(x = group, y = counts, colour = category)) +
geom_quasirandom() +
geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
outlier.shape = NA) +
scale_y_continuous(limits = c(-20, 20)) +
scale_colour_manual(values = c("darkblue", "darkorange")) +
ggtitle(paste0("Sample: ", j)) +
labs(y = "Log2 Ratio (Unique mappings/Multiple mappings)",
x = "",
colour = "Category") +
guides(colour = guide_legend(byrow = TRUE)) +
facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
theme_bw() +
theme(plot.title = element_text(colour = "black", size = 18),
axis.title = element_text(colour = "black", size = 18),
axis.text = element_text(colour = "black", size = 16),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 18),
legend.text = element_text(colour = "black", size = 16),
legend.title = element_text(colour = "black", size = 18),
legend.spacing.y = unit(0.5, "cm"))
ggsave(plot = ratio_plot,
filename = paste0(j, "_mm_multi_ratios.png"),
path = here(paste0("figures/multi_map/all_groups_ratio/")),
height = 280,
width = 420,
dpi = 400,
units = "mm", create.dir = TRUE)
}for(j in samp_nms) {
# First do concordant and discordant
cd_ratio_df <- data.frame()
for(i in group_name[1:2]) {
message("Starting group: ", i," sample: ", j, " plot")
ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::mutate(group = factor(group,
levels = c("HISAT2", "Salmon", "STAR"))) %>%
dplyr::select(group, UniqueCount, multimap) %>%
dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
category = case_when(
ratio > 0 ~ "More Unique\nMappings",
.default = "More Multiple\nMappings"
),
category = factor(category,
levels = c("More Unique\nMappings",
"More Multiple\nMappings")
)) %>%
pivot_longer(cols = "ratio",
names_to = "type",
values_to = "counts") %>%
dplyr::mutate(tx_group = case_when(
i == "highcor" ~ "Concordant Transcripts",
i == "lowcor" ~ "Discordant Transcripts",
i == "hisat2" ~ "Top Negative\nPC1 Loadings",
i == "kallisto" ~ "Top Negative\nPC2 Loadings",
i == "star" ~ "Top Positive\nPC2 Loadings",
i == "salmon" ~ "Top Negative\nPC5 Loadings"
),
tx_group = factor(tx_group,
levels = c("Concordant Transcripts",
"Discordant Transcripts",
"Top Negative\nPC1 Loadings",
"Top Positive\nPC2 Loadings",
"Top Negative\nPC2 Loadings",
"Top Negative\nPC5 Loadings"))) %>%
dplyr::filter(!counts == 0)
cd_ratio_df <- rbind(cd_ratio_df, ratio_mm)
}
cd_ratio_plot <- cd_ratio_df %>%
ggplot(aes(x = group, y = counts, colour = category)) +
geom_quasirandom() +
geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
outlier.shape = NA) +
scale_y_continuous(limits = c(-20, 20)) +
scale_colour_manual(values = c("darkblue", "darkorange")) +
ggtitle(paste0("Sample: ", j)) +
labs(colour = "Category") +
guides(colour = guide_legend(byrow = TRUE)) +
facet_wrap(~ tx_group, nrow = 1, ncol = 2) +
theme_bw() +
theme(plot.title = element_text(colour = "black", size = 18),
axis.title = element_blank(),
axis.text = element_text(colour = "black", size = 16),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 18),
legend.position = "none",
legend.text = element_text(colour = "black", size = 16),
legend.title = element_text(colour = "black", size = 18),
legend.spacing.y = unit(0.5, "cm"))
# Then do PC loadings
pc_ratio_df <- data.frame()
for(i in group_name[3:6]) {
message("Starting group: ", i," sample: ", j, " plot")
ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::mutate(group = factor(group,
levels = c("HISAT2", "Salmon", "STAR"))) %>%
dplyr::select(group, UniqueCount, multimap) %>%
dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
category = case_when(
ratio > 0 ~ "More Unique\nMappings",
.default = "More Multiple\nMappings"
),
category = factor(category,
levels = c("More Unique\nMappings",
"More Multiple\nMappings")
)) %>%
pivot_longer(cols = "ratio",
names_to = "type",
values_to = "counts") %>%
dplyr::mutate(tx_group = case_when(
i == "highcor" ~ "Concordant Transcripts",
i == "lowcor" ~ "Discordant Transcripts",
i == "hisat2" ~ "Top Negative\nPC1 Loadings",
i == "kallisto" ~ "Top Negative\nPC2 Loadings",
i == "star" ~ "Top Positive\nPC2 Loadings",
i == "salmon" ~ "Top Negative\nPC5 Loadings"
),
tx_group = factor(tx_group, levels = c("Concordant Transcripts",
"Discordant Transcripts",
"Top Negative\nPC1 Loadings",
"Top Positive\nPC2 Loadings",
"Top Negative\nPC2 Loadings",
"Top Negative\nPC5 Loadings")))
pc_ratio_df <- rbind(pc_ratio_df, ratio_mm)
}
pc_ratio_plot <- pc_ratio_df %>%
ggplot(aes(x = group, y = counts, colour = category)) +
geom_quasirandom() +
geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
outlier.shape = NA) +
scale_y_continuous(limits = c(-20, 20)) +
scale_colour_manual(values = c("darkblue", "darkorange")) +
#ggtitle(paste0("Sample: ", j)) +
labs(colour = "Category") +
guides(colour = guide_legend(byrow = TRUE)) +
facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
theme_bw() +
theme(plot.title = element_text(colour = "black", size = 18),
axis.title = element_blank(),
axis.text = element_text(colour = "black", size = 16),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 18),
legend.text = element_text(colour = "black", size = 16),
legend.title = element_text(colour = "black", size = 18),
legend.spacing.y = unit(0.5, "cm"))
fig_leg <- cowplot::get_legend(pc_ratio_plot)
ylab <- textGrob("Log2 Ratio (Unique mappings/Multiple mappings)",
gp = gpar(fontface = "bold",
col = "black",
fontsize = 20),
rot = 90)
plot_joined <- cowplot::plot_grid(
cowplot::plot_grid(cd_ratio_plot,
pc_ratio_plot + theme(legend.position = "none"),
ncol = 1,
labels = c("A)", "B)")),
fig_leg,
nrow = 1,
rel_widths = c(1, 0.15))
fig_5 <- grid.arrange(arrangeGrob(plot_joined, left = ylab))
ggsave(plot = fig_5,
filename = paste0(j, "_ratios_multi_fig5.png"),
path = here(paste0("figures/chapter_figures/figure5_ratios")),
height = 340,
width = 420,
dpi = 400,
units = "mm", create.dir = TRUE)
}Figure 5 Ratios
ratio_df_fig <- data.frame()
cor_groups <- c("highcor", "lowcor")
for(i in cor_groups) {
for(j in samp_nms) {
ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::mutate(group = factor(group,
levels = c("HISAT2", "Salmon", "STAR"))) %>%
dplyr::select(group, UniqueCount, multimap) %>%
dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
category = case_when(
ratio > 0 ~ "More Unique\nMappings",
.default = "More Multiple\nMappings"
),
category = factor(category,
levels = c("More Unique\nMappings",
"More Multiple\nMappings")
)) %>%
pivot_longer(cols = "ratio",
names_to = "type",
values_to = "counts") %>%
dplyr::mutate(sample = j,
tx_group = i)
ratio_df_fig <- rbind(ratio_df_fig, ratio_mm)
}
}
figure5_mm <- ratio_df_fig %>%
dplyr::mutate(tx_group = case_when(
tx_group == "highcor" ~ "Concordant Transcripts",
tx_group == "lowcor" ~ "Discordant Transcripts")) %>%
ggplot(aes(x = group, y = counts, colour = category)) +
geom_quasirandom(size = 1) +
scale_y_continuous(limits = c(-20, 20)) +
scale_colour_manual(values = c("darkblue", "darkorange")) +
#ggtitle(paste0("Sample: ", j)) +
labs(colour = "Category") +
guides(colour = guide_legend(byrow = TRUE)) +
facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
theme_bw() +
theme(plot.title = element_text(colour = "black", size = 18),
axis.title = element_blank(),
axis.text = element_text(colour = "black", size = 16),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 18),
legend.text = element_text(colour = "black", size = 16),
legend.title = element_text(colour = "black", size = 18),
legend.spacing.y = unit(0.5, "cm"))
ggsave(plot = figure5_mm,
filename = "figure5_multimapping.png",
path = here("figures/chapter_figures"),
height = 225,
width = 300,
dpi = 400,
units = "mm", create.dir = TRUE)Fig 5 Ratio investigate
hisat2_con_ratios <- cd_ratio_df %>%
dplyr::filter(tx_group == "Concordant Transcripts", group == "HISAT2") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(hisat2_con_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1586.075 -4.347 1.877 -18.064 11.276 144.881
salmon_con_ratios <- cd_ratio_df %>%
dplyr::filter(tx_group == "Concordant Transcripts", group == "Salmon") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(salmon_con_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4296.930 -7.558 1.486 -99.190 8.400 167.975
star_con_ratios <- cd_ratio_df %>%
dplyr::filter(tx_group == "Concordant Transcripts", group == "STAR") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(star_con_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4047.655 -4.370 1.882 396.379 11.644 10614.000
hisat2_dis_ratios <- cd_ratio_df %>%
dplyr::filter(tx_group == "Discordant Transcripts", group == "HISAT2") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(hisat2_dis_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5021.96 -232.51 -88.19 -187.67 -32.87 130.19
salmon_dis_ratios <- cd_ratio_df %>%
dplyr::filter(tx_group == "Discordant Transcripts", group == "Salmon") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(salmon_dis_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -9411.03 -480.89 -259.51 -447.38 -66.95 413.15
star_dis_ratios <- cd_ratio_df %>%
dplyr::filter(tx_group == "Discordant Transcripts", group == "STAR") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(star_dis_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -12259.32 -364.95 -161.24 -322.69 -35.62 1389.00
hisat2_pc1_neg_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Negative\nPC1 Loadings", group == "HISAT2") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(hisat2_pc1_neg_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5021.961 -138.875 -24.714 -139.224 -3.122 9.406
salmon_pc1_neg_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Negative\nPC1 Loadings", group == "Salmon") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(salmon_pc1_neg_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4250.183 -121.696 -20.121 -157.552 -3.811 5.053
star_pc1_neg_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Negative\nPC1 Loadings", group == "STAR") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(star_pc1_neg_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -12259.315 -30.072 -11.430 -147.140 -2.443 10.811
hisat2_pc2_pos_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Positive\nPC2 Loadings", group == "HISAT2") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(hisat2_pc2_pos_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5021.96 -80.56 -21.80 -145.65 -6.10 90.43
salmon_pc2_pos_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Positive\nPC2 Loadings", group == "Salmon") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(salmon_pc2_pos_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4497.835 -374.457 -86.317 -377.990 -8.501 9.471
star_pc2_pos_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Positive\nPC2 Loadings", group == "STAR") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(star_pc2_pos_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -42026.59 -356.28 -19.24 -633.26 -3.51 2334.00
hisat2_pc2_neg_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Negative\nPC2 Loadings", group == "HISAT2") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(hisat2_pc2_neg_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1419.73 -226.90 -71.67 -187.14 -27.06 130.19
salmon_pc2_neg_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Negative\nPC2 Loadings", group == "Salmon") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(salmon_pc2_neg_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -19947.10 -580.14 -260.27 -633.40 -70.01 413.15
star_pc2_neg_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Negative\nPC2 Loadings", group == "STAR") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(star_pc2_neg_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1226.747 -258.247 -69.892 -172.587 -7.547 528.000
hisat2_pc5_neg_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Negative\nPC5 Loadings", group == "HISAT2") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(hisat2_pc5_neg_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1383.40 -232.06 -97.52 -170.73 -41.25 90.43
salmon_pc5_neg_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Negative\nPC5 Loadings", group == "Salmon") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(salmon_pc5_neg_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3524.15 -503.76 -287.87 -405.40 -127.23 88.05
star_pc5_neg_ratios <- pc_ratio_df %>%
dplyr::filter(tx_group == "Top Negative\nPC5 Loadings", group == "STAR") %>%
mutate(times_higher = 2^counts,
times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
.default = times_higher))
summary(star_pc5_neg_ratios$times_higher)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1986.26 -355.03 -170.33 -258.90 -63.08 611.00
Categories
Start comparing multimapping rates between methods
for(k in group_name) {
message("Starting group ", k)
for(j in samp_nms) {
message("Starting Sample: ", j)
mm_df <- data.frame()
for(i in method_name) {
message("Starting Method ", i)
statuses_df <- multimap_group_list[[k]][[i]][[j]] %>%
dplyr::filter(!is.na(status)) %>%
dplyr::mutate(status = str_to_title(
str_replace_all(string = status,
pattern = "_",
replacement = " "),),
status = str_replace(status, "Multi", "Ambig"),
status = factor(status, levels = c("All Unique Reads",
"More Unique",
"Equal",
"More Ambig",
"All Ambig Reads"))) %>%
dplyr::mutate(method = i)
mm_df <- rbind(mm_df, statuses_df)
}
statuses_plot <- mm_df %>%
ggplot(aes(x = status, fill = status)) +
geom_bar(colour = "black") +
labs(y = "Frequency",
x = "",
fill = "Degree of\nMulti-Mapping") +
ggtitle(paste0("Sample: ", j)) +
scale_fill_viridis_d() +
facet_grid(~ method) +
theme_bw() +
theme(
axis.title = element_text(colour = "black", size = 14),
axis.text.y = element_text(colour = "black", size = 12),
plot.title = element_text(colour = "black", size = 14, face = "bold"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 14, face = "bold"),
legend.title = element_text(colour = "black",
size = 12, face = "bold")
)
tryCatch({
ggsave(plot = statuses_plot,
filename = paste0(j, "_mapping_categories.png"),
path = here(paste0("figures/multi_map/", k, "/categories/")),
height = 150,
width = 175,
dpi = 400,
units = "mm", create.dir = TRUE)
}, error = function(e) {
cat("No data found for iteration", k, ":", j)
})
}
}## No data found for iteration kallisto_unique : SRR13401117
## No data found for iteration kallisto_unique : SRR13401118
## No data found for iteration kallisto_unique : SRR13401119
## No data found for iteration kallisto_unique : SRR13401120
## No data found for iteration kallisto_unique : SRR13401121
## No data found for iteration kallisto_unique : SRR13401122
Percentage ambig
for(k in group_name) {
for(i in method_name) {
for(j in samp_nms) {
message("Starting Group ", k, ", Method ", i, ", Sample: ", j, " plot")
percent_numreads <- multimap_group_list[[k]][[i]][[j]] %>%
ggplot(aes(x = NumReads, y = 100*percent_est_from_multimap)) +
geom_point(size = 3) +
ggtitle(paste0("Method: ", i, ", Sample: ", j)) +
labs(x = "Number of Reads",
y = "Reads that Multi-Mapped (%)") +
theme_bw() +
theme(
axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
plot.title = element_text(colour = "black", size = 14,
face = "bold")
)
tryCatch({
ggsave(plot = percent_numreads,
filename = paste0(j, "_percent_numreads.png"),
path = paste0(here("figures/multi_map/",
k,
"/percent_numreads/"),
tolower(i), "/"),
height = 150,
width = 175,
dpi = 400,
units = "mm", create.dir = TRUE)
}, error = function(e) {
cat("No data found for iteration", k, ":", j)
})
}
}
}
for(i in group_name) {
for(j in samp_nms) {
message("Starting Group ", i, ", Sample: ", j, " plot")
percent_numreads <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
ggplot(aes(x = NumReads, y = 100*percent_est_from_multimap)) +
geom_point(size = 2) +
ggtitle(paste0("Sample: ", j)) +
labs(x = "Number of Reads",
y = "Reads that Multi-Mapped (%)") +
facet_grid(~ group) +
theme_bw() +
theme(
axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
plot.title = element_text(colour = "black", size = 14,
face = "bold"),
strip.background = element_rect(fill = "lightblue")
)
tryCatch({
ggsave(plot = percent_numreads,
filename = paste0(j, "_percent_numreads.png"),
path = paste0(here("figures/multi_map/",
i,
"/percent_numreads/")),
height = 100,
width = 250,
dpi = 400,
units = "mm", create.dir = TRUE)
}, error = function(e) {
cat("No data found for iteration", i, ":", j)
})
}
}Variance Percent
var_df <- read_csv(here("data/counts/variance_df.csv"))
for(k in group_name) {
for(i in method_name) {
for(j in samp_nms) {
message("Starting Group ", k, ", Method ", i, ", Sample: ", j, " plot")
var_percent <- multimap_group_list[[k]][[i]][[j]] %>%
dplyr::mutate("transcript_id" = str_remove(Name, "\\..*")) %>%
inner_join(var_df,
by = "transcript_id") %>%
dplyr::arrange(desc(rowvar)) %>%
ggplot(aes(x = rowvar, y = 100*percent_est_from_multimap)) +
geom_point() +
labs(x = "Transcript Count Variance",
y = "Reads that Multi-Mapped (%)") +
theme_bw()
tryCatch({
ggsave(plot = var_percent,
filename = paste0(j, "_var_percent.png"),
path = paste0(here("figures/multi_map/", k, "/var_percent/"),
tolower(i), "/"),
height = 150,
width = 175,
dpi = 400,
units = "mm", create.dir = TRUE)
}, error = function(e) {
cat("No data found for iteration", k, ":", i, "-", j)
})
}
}
}
for(i in group_name) {
for(j in samp_nms) {
message("Starting Group ", i, ", Sample: ", j, " plot")
var_percent <- multimap_group_list[[i]][["STAR"]][[j]] %>%
dplyr::mutate(group = "STAR") %>%
rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
dplyr::mutate(group = "Salmon")) %>%
rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
dplyr::mutate(group = "HISAT2")) %>%
dplyr::mutate("transcript_id" = str_remove(Name, "\\..*")) %>%
inner_join(var_df,
by = "transcript_id") %>%
dplyr::arrange(desc(rowvar)) %>%
ggplot(aes(x = rowvar, y = 100*percent_est_from_multimap)) +
geom_point(size = 2) +
facet_grid(~ group) +
ggtitle(paste0("Sample: ", j)) +
labs(x = "Transcript Count Variance",
y = "Reads that Multi-Mapped (%)") +
theme_bw() +
theme(
axis.title = element_text(colour = "black", size = 14),
axis.text = element_text(colour = "black", size = 12),
plot.title = element_text(colour = "black", size = 14,
face = "bold"),
strip.background = element_rect(fill = "lightblue")
)
tryCatch({
ggsave(plot = var_percent,
filename = paste0(j, "_var_percent.png"),
path = paste0(here("figures/multi_map/", i, "/var_percent/")),
height = 100,
width = 205,
dpi = 400,
units = "mm", create.dir = TRUE)
}, error = function(e) {
cat("No data found for iteration", i, ":", j)
})
}
}## No data found for iteration hisat2_unique : SRR13401116
## No data found for iteration hisat2_unique : SRR13401117
## No data found for iteration hisat2_unique : SRR13401118
## No data found for iteration hisat2_unique : SRR13401119
## No data found for iteration hisat2_unique : SRR13401120
## No data found for iteration hisat2_unique : SRR13401121
## No data found for iteration hisat2_unique : SRR13401122
## No data found for iteration hisat2_unique : SRR13401123
## No data found for iteration kallisto_unique : SRR13401116
## No data found for iteration kallisto_unique : SRR13401117
## No data found for iteration kallisto_unique : SRR13401118
## No data found for iteration kallisto_unique : SRR13401119
## No data found for iteration kallisto_unique : SRR13401120
## No data found for iteration kallisto_unique : SRR13401121
## No data found for iteration kallisto_unique : SRR13401122
## No data found for iteration kallisto_unique : SRR13401123
## No data found for iteration star_unique : SRR13401116
## No data found for iteration star_unique : SRR13401117
## No data found for iteration star_unique : SRR13401118
## No data found for iteration star_unique : SRR13401119
## No data found for iteration star_unique : SRR13401120
## No data found for iteration star_unique : SRR13401121
## No data found for iteration star_unique : SRR13401122
## No data found for iteration star_unique : SRR13401123
## No data found for iteration salmon_unique : SRR13401116
## No data found for iteration salmon_unique : SRR13401117
## No data found for iteration salmon_unique : SRR13401118
## No data found for iteration salmon_unique : SRR13401119
## No data found for iteration salmon_unique : SRR13401120
## No data found for iteration salmon_unique : SRR13401121
## No data found for iteration salmon_unique : SRR13401122
## No data found for iteration salmon_unique : SRR13401123
Number of transcripts
star_multitx <- star_multi_mapping[[1]] %>%
dplyr::mutate(has_multi = case_when(AmbigCount > 0 ~ TRUE,
AmbigCount == 0 ~ FALSE))
star_multitx$has_multi %>% table()## .
## FALSE TRUE
## 9253 153101
salmon_multitx <- sa_multi_mapping[[1]] %>%
dplyr::mutate(has_multi = case_when(AmbigCount > 0 ~ TRUE,
AmbigCount == 0 ~ FALSE))
salmon_multitx$has_multi %>% table()## .
## FALSE TRUE
## 5949 172697
hisat2_multitx <- hisat2_multi_mapping[[1]] %>%
dplyr::mutate(has_multi = case_when(AmbigCount > 0 ~ TRUE,
AmbigCount == 0 ~ FALSE))
hisat2_multitx$has_multi %>% table()## .
## FALSE TRUE
## 7830 169044